predicting review sentiment with a neural net
Written on
In a previous post we used scikit-learn to predict customer sentiment from product reviews; in this post we'll do the same, but from scratch using NumPy. First we load the reviews and one-hot encode them.
import numpy as np
import pandas as pd
import string
import sys
# Only the first num_reviews lines are used, so stop reading there instead of
# pulling the whole training file into memory with readlines().
num_reviews = 1000
with open('/home/kayla/pylearn/sentiment_numpy/train.ft.txt') as file:
    # zip() asks range() for a value first and stops as soon as it is
    # exhausted, so exactly num_reviews lines (or fewer, if the file is
    # shorter) are consumed from the file
    reviews = [line for _, line in zip(range(num_reviews), file)]
sentence_list = reviews[0:num_reviews]
# one review per row, in a single 'review' column
sentences = pd.DataFrame(sentence_list,columns=['review'])
def one_hot(sentences):
    """One-hot encode the words of each review.

    Parameters
    ----------
    sentences : pandas.DataFrame
        Data frame with a 'review' column holding one review string per row.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 frame with the same index as `sentences` and one column
        per distinct word. 'label1' and 'label2' are always the first two
        columns (all zeros if the word never occurs); the remaining columns
        follow in the order the words are first seen.

    Each review has its ASCII punctuation removed, is lower-cased and split
    on whitespace; words in NLTK's English stop-word list are ignored.
    Requires the NLTK 'stopwords' corpus to be installed.
    """
    # get a list of common words to exclude
    from nltk.corpus import stopwords
    stops = set(stopwords.words('english'))
    # build the punctuation-stripping table once rather than once per review
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # word -> column position; the label columns are pinned to the front
    columns = {'label1': 0, 'label2': 1}
    # coordinates of every cell that must be set to 1
    row_ids = []
    col_ids = []
    for row, text in enumerate(sentences['review']):
        cleaned = text.translate(strip_punctuation).lower()
        # dict.fromkeys de-duplicates while keeping first-occurrence order,
        # so the column order does not depend on set iteration order
        for word in dict.fromkeys(cleaned.split()):
            if word in stops:
                continue
            row_ids.append(row)
            # new words are appended as the next free column
            col_ids.append(columns.setdefault(word, len(columns)))

    # fill the whole matrix in one vectorised assignment instead of growing
    # a data frame one cell (and one new column) at a time
    matrix = np.zeros((sentences.shape[0], len(columns)), dtype='int')
    matrix[np.asarray(row_ids, dtype=int), np.asarray(col_ids, dtype=int)] = 1
    return pd.DataFrame(matrix, index=sentences.index, columns=list(columns))
# One-hot encode the sampled reviews, then cache the encoded table on disk
# (without the index column) so later steps can start from the CSV.
reviews = one_hot(sentences)
reviews.to_csv('amazon_reviews.csv', index=False)
Next, let's create training and test sets as well as set up a neural net to feed the data into.
# Reload the encoded reviews. The two label columns are pulled out by name,
# and everything after the first two columns is kept as the word features.
reviews = pd.read_csv('amazon_reviews.csv')
labels = reviews[['label1','label2']]
reviews = reviews.iloc[:,2:]

# set up the training and test sets: the leading train_frac of the rows is
# used for training and the remaining rows for testing (no shuffling)
train_frac = 0.8
total_rows = reviews.shape[0]
num_train = int(train_frac*total_rows)
num_test = total_rows-num_train
test_stop = num_train+num_test
Xtrain, Xtest = reviews[:num_train], reviews[num_train:test_stop]
ytrain, ytest = labels[:num_train], labels[num_train:test_stop]
# define activation functions and their derivatives
def activ_func(type_,layer):
    """Apply an activation function to a layer's pre-activation values.

    Parameters
    ----------
    type_ : int
        1 selects tanh (element-wise); 2 selects softmax, normalised over
        every element of `layer`.
    layer : numpy.ndarray
        Pre-activation values.

    Returns
    -------
    numpy.ndarray
        Activated values with the same shape as `layer`. Any other `type_`
        falls through and returns None.
    """
    if (type_==1):
        return np.tanh(layer)
    elif (type_==2):
        # Subtracting the max leaves the softmax value unchanged but keeps
        # np.exp from overflowing to inf (and the ratio from becoming nan)
        # when the inputs are large.
        exps = np.exp(layer-np.max(layer))
        return exps/exps.sum()
def activ_deriv(type_,layer):
    """Element-wise derivative of an activation function.

    `layer` is treated as pre-activation values: the activation itself is
    evaluated on it inside this function.

    Parameters
    ----------
    type_ : int
        1 selects tanh; 2 selects softmax (normalised over every element of
        `layer`).
    layer : numpy.ndarray
        Values at which the derivative is evaluated.

    Returns
    -------
    numpy.ndarray
        For tanh, 1 - tanh(layer)**2. For softmax, s*(1 - s) where s is the
        softmax of `layer`, i.e. only the diagonal terms of the softmax
        Jacobian. Any other `type_` falls through and returns None.
    """
    if (type_==1):
        return 1.0-np.tanh(layer)**2.0
    elif (type_==2):
        # Subtracting the max leaves the softmax value unchanged but keeps
        # np.exp from overflowing to inf (and the result from becoming nan)
        # when the inputs are large.
        exps = np.exp(layer-np.max(layer))
        softmax = exps/exps.sum()
        return softmax*(1.0-softmax)
# set up neural net weights and layers
num_layers = 6
num_input = reviews.shape[1]
num_output = 2
num_hidden = 50
weight_scale = 0.2
np.random.seed(2)

# One activation buffer per layer. Every buffer after the input one is
# allocated with num_hidden columns (including the last); the forward pass
# overwrites each buffer with that layer's actual output.
layers = [np.zeros((1,num_input))]
layers += [np.zeros((1,num_hidden)) for _ in range(num_layers-1)]

# Layer widths from input to output; each consecutive pair of widths gives
# the shape of the weight matrix between two layers. Weights are drawn
# uniformly from [-weight_scale/2, weight_scale/2), input side first.
widths = [num_input]+[num_hidden]*(num_layers-2)+[num_output]
weights = [
    weight_scale*np.random.random((fan_in,fan_out))-weight_scale/2.0
    for fan_in, fan_out in zip(widths[:-1],widths[1:])
]
Now let's train the model.
# Train the network with one weight update per training row. Passes over the
# training set repeat until the per-row average of the summed squared output
# error drops to err_tol, or max_iter passes have been made.
# maximum number of passes over the training set
max_iter = 200
# training stops once the average error of a pass is no longer above this
err_tol = 0.2
# step size applied to every weight update
alpha = 0.1
# start above err_tol so the loop body runs at least once
err = 1.e5
iter_num = 0
# 1 switches on the random dropout mask for the hidden layers
dropout = 1
while ((err>err_tol)&(iter_num<max_iter)):
    err = 0.0
    for i in range(Xtrain.shape[0]):
        # forward propagation step
        if (dropout==1):
            # fresh 0/1 mask for every training row, one row of the mask
            # per hidden layer
            dropout_mask = \
                np.random.randint(0,2,(num_layers-2,num_hidden))
        layers[0] = Xtrain.iloc[i].to_numpy().reshape(1,num_input)
        for j in range(1,len(layers)):
            if (j<(len(layers)-1)):
                # hidden layers use tanh (type 1)
                if (dropout==1):
                    # masked units are zeroed and the rest are doubled
                    # (presumably to offset that about half are dropped)
                    layers[j] = 2.0*dropout_mask[j-1]* \
                        activ_func(1,np.matmul(layers[j-1], \
                        weights[j-1]))
                else:
                    layers[j] = \
                        activ_func(1,np.matmul(layers[j-1], \
                        weights[j-1]))
            else:
                # the last layer uses softmax (type 2) and no dropout
                layers[j] = \
                    activ_func(2,np.matmul(layers[j-1],weights[j-1]))
        # backprop step
        # form the diagonal matrices for the derivatives of the
        # activations for each layer
        # NOTE(review): layers[j] already holds the activated output from
        # the forward pass, and activ_deriv applies the activation to its
        # argument again, so these derivatives are not evaluated at the
        # pre-activation values. The 2.0 dropout factor used in the forward
        # pass is also absent here. Confirm whether this is intended before
        # changing it; the hyperparameters were chosen with this behaviour.
        diag_derivs = []
        for j in range(1,len(layers)):
            if (j<(len(layers)-1)):
                type_ = 1
                if (dropout==1):
                    derivs = \
                        np.diag(dropout_mask[j-1]* \
                        activ_deriv(type_,layers[j]).ravel())
                else:
                    derivs = \
                        np.diag(activ_deriv(type_,layers[j]).ravel())
            else:
                type_ = 2
                derivs = \
                    np.diag(activ_deriv(type_,layers[j]).ravel())
            # diag_derivs[k] belongs to layers[k+1]
            diag_derivs.append(derivs)
        # update the weights
        correct_ans = ytrain.iloc[i,0:num_output].to_numpy(). \
            reshape(1,num_output)
        # output error for this row
        delta = layers[num_layers-1]-correct_ans
        # mat_prod carries the error signal backwards; it starts at the
        # output layer
        mat_prod = np.matmul( \
            delta,diag_derivs[num_layers-2])
        weights[num_layers-2] += -alpha* \
            np.outer(layers[num_layers-2],mat_prod)
        for j in range(0,len(weights)-1):
            # update the weights behind each layer working from
            # output toward input
            # weight_index: matrix the signal is propagated back through;
            # diag_index: matrix being updated (equal to weight_index-1
            # when len(weights) == num_layers-1)
            weight_index = num_layers-j-2
            diag_index = len(weights)-j-2
            #print(weight_index,diag_index)
            # NOTE(review): weights[weight_index] was already updated
            # earlier in this row's backward pass, so the propagation uses
            # the new values rather than the ones from the forward pass.
            next_pair = np.matmul(weights[weight_index].T, \
                diag_derivs[weight_index-1])
            mat_prod = np.matmul(mat_prod,next_pair)
            weights[diag_index] += -alpha* \
                np.outer(layers[weight_index-1],mat_prod)
        # accumulate the summed squared output error of this row
        err += (delta**2.0).sum()
    # average over the training rows
    err /= Xtrain.shape[0]
    print(iter_num+1,err)
    iter_num += 1
# iter_num is below max_iter only when the loop stopped because the error
# reached err_tol before all passes were used
if (iter_num<max_iter):
    print('converged at iteration: ',iter_num)
print('training set error: ',err)
1 0.49980698697558884
2 0.49883427876339753
3 0.501143717639368
4 0.4961312915885812
5 0.4974398386496625
6 0.48694445301040384
7 0.47873258603941676
8 0.4627638083564027
9 0.45554736881128927
10 0.43554726678092753
11 0.4160133901300509
12 0.3905081397950724
13 0.34602970606645256
14 0.31516613543381433
15 0.26634745838598434
16 0.26187804345625254
17 0.23263605939285184
18 0.18831707208296627
converged at iteration: 18
training set error: 0.18831707208296627
Finally let's check the model's out-of-sample accuracy.
# check test set accuracy: run every held-out row through the trained
# network (no dropout mask) and count the rows whose rounded output matches
# the label pair exactly
num_correct = 0
output_index = len(layers)-1
for row in range(Xtest.shape[0]):
    # forward propagation step
    layers[0] = Xtest.iloc[row].to_numpy().reshape(1,num_input)
    for depth in range(1,len(layers)):
        pre_activation = np.matmul(layers[depth-1],weights[depth-1])
        # tanh (type 1) on the hidden layers, softmax (type 2) on the last
        kind = 1 if depth<output_index else 2
        layers[depth] = activ_func(kind,pre_activation)
    correct_ans = ytest.iloc[row,0:num_output].to_numpy()
    correct_ans = correct_ans.reshape(1,num_output)
    prediction = layers[num_layers-1].round()
    if (prediction==correct_ans).all():
        num_correct += 1
percent_correct = (num_correct/Xtest.shape[0])*100.0
print("percent test set accuracy: ",percent_correct)
percent test set accuracy: 79.0
79% accuracy on the test set isn't bad considering that we only used 1000 reviews in total (800 for training and 200 for testing). We kept the number low because our simple one-hot encoding function isn't optimized for handling large datasets.